{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: \"Abhishek's Features\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on Abhishek Thakur's features published on [GitHub](https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question) and [Kaggle forum](https://www.kaggle.com/c/quora-question-pairs/discussion/31284)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fuzzywuzzy import fuzz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.stats import skew, kurtosis\n",
    "from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_list_id = '3rdparty_abhishek'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The path to the saved GoogleNews Word2Vec model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "google_news_model_path = os.path.join(project.aux_dir, 'word2vec', 'GoogleNews-vectors-negative300.bin.gz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Original question datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('').drop(['id', 'qid1', 'qid2'], axis=1)\n",
    "df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('').drop(['test_id'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Raw implementations from Abhishek below (excluding the features we already have in other notebooks):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wmd(model, s1, s2):\n",
    "    s1 = str(s1).lower().split()\n",
    "    s2 = str(s2).lower().split()\n",
    "    stop_words = stopwords.words('english')\n",
    "    s1 = [w for w in s1 if w not in stop_words]\n",
    "    s2 = [w for w in s2 if w not in stop_words]\n",
    "    return model.wmdistance(s1, s2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def norm_wmd(model, s1, s2):\n",
    "    s1 = str(s1).lower().split()\n",
    "    s2 = str(s2).lower().split()\n",
    "    stop_words = stopwords.words('english')\n",
    "    s1 = [w for w in s1 if w not in stop_words]\n",
    "    s2 = [w for w in s2 if w not in stop_words]\n",
    "    return model.wmdistance(s1, s2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sent2vec(model, s):\n",
    "    words = s.lower()\n",
    "    words = word_tokenize(words)\n",
    "    words = [w for w in words if not w in stop_words]\n",
    "    words = [w for w in words if w.isalpha()]\n",
    "    M = []\n",
    "    for w in words:\n",
    "        try:\n",
    "            M.append(model[w])\n",
    "        except:\n",
    "            continue\n",
    "    M = np.array(M)\n",
    "    v = M.sum(axis=0)\n",
    "    return v / np.sqrt((v ** 2).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extend_with_features(data):\n",
    "    data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)\n",
    "    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)\n",
    "    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)\n",
    "\n",
    "    model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)\n",
    "    data['wmd'] = data.apply(lambda x: wmd(model, x['question1'], x['question2']), axis=1)\n",
    "\n",
    "    norm_model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)\n",
    "    norm_model.init_sims(replace=True)\n",
    "    data['norm_wmd'] = data.apply(lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)\n",
    "\n",
    "    question1_vectors = np.zeros((data.shape[0], 300))\n",
    "    for i, q in progressbar(enumerate(data.question1.values), total=len(data)):\n",
    "        question1_vectors[i, :] = sent2vec(model, q)\n",
    "\n",
    "    question2_vectors  = np.zeros((data.shape[0], 300))\n",
    "    for i, q in progressbar(enumerate(data.question2.values), total=len(data)):\n",
    "        question2_vectors[i, :] = sent2vec(model, q)\n",
    "\n",
    "    question1_vectors = np.nan_to_num(question1_vectors)\n",
    "    question2_vectors = np.nan_to_num(question2_vectors)\n",
    "    \n",
    "    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]\n",
    "\n",
    "    data['skew_q1vec'] = [skew(x) for x in question1_vectors]\n",
    "    data['skew_q2vec'] = [skew(x) for x in question2_vectors]\n",
    "    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]\n",
    "    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 404290/404290 [02:22<00:00, 2838.59it/s]\n",
      "100%|██████████| 404290/404290 [02:20<00:00, 2876.60it/s]\n"
     ]
    }
   ],
   "source": [
    "extend_with_features(df_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2345796/2345796 [10:27<00:00, 3736.96it/s]\n",
      "100%|██████████| 2345796/2345796 [10:06<00:00, 3868.06it/s]\n"
     ]
    }
   ],
   "source": [
    "extend_with_features(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.drop(['is_duplicate', 'question1', 'question2'], axis=1, inplace=True)\n",
    "df_test.drop(['question1', 'question2'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build final features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = np.array(df_train.values, dtype='float64')\n",
    "X_test = np.array(df_test.values, dtype='float64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train: (404290, 16)\n",
      "X_test:  (2345796, 16)\n"
     ]
    }
   ],
   "source": [
    "print('X_train:', X_train.shape)\n",
    "print('X_test: ', X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>common_words</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>4.511586</td>\n",
       "      <td>3.098622</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>41.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fuzz_qratio</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>61.951198</td>\n",
       "      <td>18.374723</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>47.000000</td>\n",
       "      <td>61.000000</td>\n",
       "      <td>77.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fuzz_WRatio</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>76.500195</td>\n",
       "      <td>15.293681</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>66.000000</td>\n",
       "      <td>85.000000</td>\n",
       "      <td>86.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wmd</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>inf</td>\n",
       "      <td>nan</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.299135</td>\n",
       "      <td>2.063220</td>\n",
       "      <td>2.882339</td>\n",
       "      <td>inf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>norm_wmd</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>inf</td>\n",
       "      <td>nan</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.466978</td>\n",
       "      <td>0.762440</td>\n",
       "      <td>1.053329</td>\n",
       "      <td>inf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cosine_distance</th>\n",
       "      <td>402512.000000</td>\n",
       "      <td>0.268454</td>\n",
       "      <td>0.210210</td>\n",
       "      <td>-0.000000</td>\n",
       "      <td>0.113917</td>\n",
       "      <td>0.221438</td>\n",
       "      <td>0.366977</td>\n",
       "      <td>1.120409</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cityblock_distance</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>9.180679</td>\n",
       "      <td>4.318149</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>6.602948</td>\n",
       "      <td>9.215218</td>\n",
       "      <td>11.884559</td>\n",
       "      <td>20.709869</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>jaccard_distance</th>\n",
       "      <td>403768.000000</td>\n",
       "      <td>0.929331</td>\n",
       "      <td>0.256268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>canberra_distance</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>131.848440</td>\n",
       "      <td>48.919782</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>112.812138</td>\n",
       "      <td>138.670967</td>\n",
       "      <td>161.453252</td>\n",
       "      <td>300.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>euclidean_distance</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>0.663576</td>\n",
       "      <td>0.311963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.477118</td>\n",
       "      <td>0.666134</td>\n",
       "      <td>0.858627</td>\n",
       "      <td>1.496936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>minkowski_distance</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>0.299342</td>\n",
       "      <td>0.140796</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.215219</td>\n",
       "      <td>0.300415</td>\n",
       "      <td>0.387251</td>\n",
       "      <td>0.682163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>braycurtis_distance</th>\n",
       "      <td>403768.000000</td>\n",
       "      <td>0.370718</td>\n",
       "      <td>0.200331</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.245800</td>\n",
       "      <td>0.352750</td>\n",
       "      <td>0.474889</td>\n",
       "      <td>1.136902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>skew_q1vec</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>0.010076</td>\n",
       "      <td>0.135183</td>\n",
       "      <td>-0.669451</td>\n",
       "      <td>-0.080358</td>\n",
       "      <td>0.011267</td>\n",
       "      <td>0.101131</td>\n",
       "      <td>0.675833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>skew_q2vec</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>0.010130</td>\n",
       "      <td>0.134776</td>\n",
       "      <td>-0.669451</td>\n",
       "      <td>-0.080048</td>\n",
       "      <td>0.011210</td>\n",
       "      <td>0.100334</td>\n",
       "      <td>0.781670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kur_q1vec</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>-0.060690</td>\n",
       "      <td>0.308918</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-0.239165</td>\n",
       "      <td>-0.082283</td>\n",
       "      <td>0.101566</td>\n",
       "      <td>2.288984</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kur_q2vec</th>\n",
       "      <td>404290.000000</td>\n",
       "      <td>-0.061898</td>\n",
       "      <td>0.306482</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-0.240417</td>\n",
       "      <td>-0.083331</td>\n",
       "      <td>0.100070</td>\n",
       "      <td>2.227666</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            count       mean       std       min        25%  \\\n",
       "common_words        404290.000000   4.511586  3.098622  0.000000   2.000000   \n",
       "fuzz_qratio         404290.000000  61.951198 18.374723  0.000000  47.000000   \n",
       "fuzz_WRatio         404290.000000  76.500195 15.293681  0.000000  66.000000   \n",
       "wmd                 404290.000000        inf       nan  0.000000   1.299135   \n",
       "norm_wmd            404290.000000        inf       nan  0.000000   0.466978   \n",
       "cosine_distance     402512.000000   0.268454  0.210210 -0.000000   0.113917   \n",
       "cityblock_distance  404290.000000   9.180679  4.318149  0.000000   6.602948   \n",
       "jaccard_distance    403768.000000   0.929331  0.256268  0.000000   1.000000   \n",
       "canberra_distance   404290.000000 131.848440 48.919782  0.000000 112.812138   \n",
       "euclidean_distance  404290.000000   0.663576  0.311963  0.000000   0.477118   \n",
       "minkowski_distance  404290.000000   0.299342  0.140796  0.000000   0.215219   \n",
       "braycurtis_distance 403768.000000   0.370718  0.200331  0.000000   0.245800   \n",
       "skew_q1vec          404290.000000   0.010076  0.135183 -0.669451  -0.080358   \n",
       "skew_q2vec          404290.000000   0.010130  0.134776 -0.669451  -0.080048   \n",
       "kur_q1vec           404290.000000  -0.060690  0.308918 -3.000000  -0.239165   \n",
       "kur_q2vec           404290.000000  -0.061898  0.306482 -3.000000  -0.240417   \n",
       "\n",
       "                           50%        75%        max  \n",
       "common_words          4.000000   6.000000  41.000000  \n",
       "fuzz_qratio          61.000000  77.000000 100.000000  \n",
       "fuzz_WRatio          85.000000  86.000000 100.000000  \n",
       "wmd                   2.063220   2.882339        inf  \n",
       "norm_wmd              0.762440   1.053329        inf  \n",
       "cosine_distance       0.221438   0.366977   1.120409  \n",
       "cityblock_distance    9.215218  11.884559  20.709869  \n",
       "jaccard_distance      1.000000   1.000000   1.000000  \n",
       "canberra_distance   138.670967 161.453252 300.000000  \n",
       "euclidean_distance    0.666134   0.858627   1.496936  \n",
       "minkowski_distance    0.300415   0.387251   0.682163  \n",
       "braycurtis_distance   0.352750   0.474889   1.136902  \n",
       "skew_q1vec            0.011267   0.101131   0.675833  \n",
       "skew_q2vec            0.011210   0.100334   0.781670  \n",
       "kur_q1vec            -0.082283   0.101566   2.288984  \n",
       "kur_q2vec            -0.083331   0.100070   2.227666  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.describe().T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = [\n",
    "    'abh_common_words',\n",
    "    'abh_fuzz_qratio',\n",
    "    'abh_fuzz_WRatio',\n",
    "    'abh_wmd',\n",
    "    'abh_norm_wmd',\n",
    "    'abh_cosine_distance',\n",
    "    'abh_cityblock_distance',\n",
    "    'abh_jaccard_distance',\n",
    "    'abh_canberra_distance',\n",
    "    'abh_euclidean_distance',\n",
    "    'abh_minkowski_distance',\n",
    "    'abh_braycurtis_distance',\n",
    "    'abh_skew_q1vec',\n",
    "    'abh_skew_q2vec',\n",
    "    'abh_kur_q1vec',\n",
    "    'abh_kur_q2vec',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
